package com.uusee.crawler.pageprocessor.video.ku6;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;

import com.uusee.crawler.util.PageProcessorUtils;
import com.uusee.shipshape.gt.model.UploadUser;

/**
 * Extracts upload-user entries from the HTML of a Ku6 page.
 *
 * <p>Every {@code <dt>} fragment that matches {@link #UPLOAD_USER_REG} is turned
 * into one {@link UploadUser}.
 */
public class Ku6UploadUserTopPageProcessor {
	/**
	 * Matches one user entry. Capture groups:
	 * <ol>
	 *   <li>the full link target starting with {@code http://v.ku6.com/};</li>
	 *   <li>the part of the link after {@code http://v.ku6.com/}, stored as the user id;</li>
	 *   <li>the {@code title} attribute of the link, stored as the user name;</li>
	 *   <li>the {@code src} attribute of the image, stored as the logo.</li>
	 * </ol>
	 */
	private static final String UPLOAD_USER_REG = "<dt><a[^<]*?href=\"(http://v.ku6.com/([^<]*?))\"[^<]*?title=\"([^<]*?)\"[^<]*?><img src=\"([^<]*?)\"[^<]*?/></a>.*?</dt>";
	
	/**
	 * Scans {@code pageHtml} and builds one {@link UploadUser} per matching entry.
	 *
	 * @param url      address of the page; not used by this method
	 * @param pageHtml HTML text to scan
	 * @return the users found, in the order they were matched; empty when nothing matches
	 * @throws RuntimeException if the pattern cannot be compiled or scanning fails
	 *                          unexpectedly; the underlying exception is attached as the cause
	 */
	public List<UploadUser> process(String url,String pageHtml) {
		List<UploadUser> uploadUserList = new ArrayList<UploadUser>();
		try {
			PatternCompiler compiler = new Perl5Compiler();
			Pattern pattern = compiler.compile(UPLOAD_USER_REG, Perl5Compiler.CASE_INSENSITIVE_MASK);
			PatternMatcher matcher = new Perl5Matcher();
			// The input object carries the scan position, so each contains() call
			// continues after the previous match instead of restarting.
			PatternMatcherInput input = new PatternMatcherInput(pageHtml);
			while (matcher.contains(input, pattern)) {
				try {
					MatchResult m = matcher.getMatch();
					String userBlogUrl = m.group(1);
					String userId = m.group(2);
					String username = PageProcessorUtils.unicodeToChinese(m.group(3));
					String logo = m.group(4);
					
					UploadUser uploadUser = new UploadUser();
					uploadUser.setSourceSite("www.ku6.com");
					uploadUser.setUserId(userId);
					uploadUser.setUsername(username);
					uploadUser.setUserBlogUrl(userBlogUrl);
					uploadUser.setLogo(logo);
					uploadUser.setUpdateUser("crawler");
					uploadUser.setUpdateDate(new Date());
					
					uploadUserList.add(uploadUser);
				} catch (Exception ignored) {
					// Best effort: an entry that cannot be converted is skipped so
					// that the remaining entries on the page are still returned.
				}
			}
			return uploadUserList;
		} catch (Exception e) {
			// Keep the original exception as the cause so the real failure
			// (and its stack trace) is not lost.
			throw new RuntimeException("失败。", e);
		}
	}

}
